from src.data_management.RecSys2019Reader import *
import pandas as pd
import numpy as np
import scipy.sparse as sps
import matplotlib.pyplot as plt
from src.data_management.data_getter import *
from src.plots.plot_evaluation_helper import *
from course_lib.Data_manager.DataReader_utils import merge_ICM
from src.data_management.RecSys2019Reader import RecSys2019Reader
from src.data_management.RecSys2019Reader_utils import merge_UCM
from src.data_management.data_reader import read_target_users, read_URM_cold_all, read_UCM_cold_all
from src.model import best_models
from src.plots.plot_evaluation_helper import plot_popularity_discretized
from src.data_management.data_getter import get_popular_items
from scipy.stats import skew
from src.model import new_best_models
from src.data_management.DataPreprocessing import DataPreprocessingDiscretization, \
DataPreprocessingImputation, DataPreprocessingFeatureEngineering, DataPreprocessingTransform
# Load the RecSys 2019 dataset and build the interaction matrix (URM) plus
# the item/user side-information matrices (ICM/UCM).
# NOTE(review): path here is "../../data/" but a later cell uses "../data/" — confirm which is correct.
data_reader = RecSys2019Reader("../../data/")
data_reader.load_data()
URM_all = data_reader.get_URM_all()
ICM_categorical = data_reader.get_ICM_from_name("ICM_sub_class")
UCM_age = data_reader.get_UCM_from_name("UCM_age")
UCM_region = data_reader.get_UCM_from_name("UCM_region")
# Merge age and region UCMs into a single user feature matrix; the merged mappers are discarded.
UCM_all, _ = merge_UCM(UCM_age, UCM_region, {}, {})
This question has not already been explored in the previous notebook on target-user analysis.
# Read the users we must produce recommendations for, then measure and plot
# their activity (number of interactions) on the cold (unsplit) URM.
df_target = pd.read_csv("../data/data_target_users_test.csv")
target_users = df_target.user_id.values
URM_cold_all = read_URM_cold_all("../data/data_train.csv")
target_URM = URM_cold_all[target_users]
user_act = np.sort(np.array((target_URM > 0).sum(axis=1)).squeeze())
threshold_list = [0, 10, 20, 30, 40, 1000]
plot_popularity_discretized(user_act, threshold_list, y_label="Percentage of user popularity")
plt.plot(user_act, 'ro')
plt.xlabel('User index')
plt.ylabel('Number of interactions')
plt.show()
# Users with > 40 interactions overall minus those with > 40 in the target set:
# i.e. warm users we are never asked to predict.
# NOTE(review): uses sum(axis=1) as activity count — assumes implicit 0/1 ratings; confirm.
np.array(URM_all.sum(axis=1)).squeeze()[np.array(URM_all.sum(axis=1)).squeeze() > 40].size - user_act[user_act > 40].size
There are 125 warm users (with more than 40 activities) that are not in the test set. Let's check the same for the very warm ones.
# Same computation as above but with the 140-interaction ("very warm") threshold.
np.array(URM_all.sum(axis=1)).squeeze()[np.array(URM_all.sum(axis=1)).squeeze() > 140].size - user_act[user_act > 140].size
5 out of 55 of the very warm users should not be predicted. Who are they? What is their activity? Are they very very warm (i.e. > 140) or only warm (> 40 and < 140)?
Taking inspiration from the MAP graphs of many recommenders, we want to study how users with more than 40 interactions are structured.
# Boolean mask and indices of users with more than 40 interactions ("very warm");
# row degree is read straight from the CSR indptr differences.
very_warm_users_mask = np.ediff1d(URM_all.tocsr().indptr) > 40
very_warm_users = np.flatnonzero(very_warm_users_mask)
very_warm_users.size
We should note first of all that these users never disappear while splitting, since we are taking out very few ratings; therefore there is no variability across the different splits in this sense.
First of all, let's recap the general distribution, and how different their number is compared to other, less active, users.
# General activity distribution over ALL users, discretized by activity bands.
user_act = np.sort(np.array((URM_all > 0).sum(axis=1)).squeeze())
threshold_list = [0, 10, 20, 30, 40, 1000]
plot_popularity_discretized(user_act, threshold_list, y_label="Percentage of user popularity")
# Among this popular users, what is the distribution?
very_warm_URM = URM_all[very_warm_users_mask]
very_warm_user_act = (very_warm_URM > 0).sum(axis=1)
very_warm_user_act = np.array(very_warm_user_act).squeeze()
very_warm_user_act = np.sort(very_warm_user_act)
# BUG FIX: the original plotted the full `user_act` even though
# `very_warm_user_act` was just computed (and otherwise unused);
# plot the very-warm activity distribution, as the comment above intends.
plt.plot(very_warm_user_act)
plt.xlabel('User index')
plt.ylabel('Number of interactions')
plt.show()
As we can see, the distribution is highly skewed; let's try to separate these users and subdivide them.
# Activity distribution restricted to the very warm users, with finer thresholds.
user_act = np.sort(np.array((very_warm_URM > 0).sum(axis=1)).squeeze())
threshold_list = [40, 70, 100, 140, 190, 1000]
plot_popularity_discretized(user_act, threshold_list, y_label="Percentage of user popularity")
# Split the very warm users into "extreme" (> 140 interactions) and the rest,
# then plot the sorted activity of the non-extreme group.
extreme_warm_users_mask = np.ediff1d(URM_all.tocsr().indptr) > 140
extreme_warm_users = np.flatnonzero(extreme_warm_users_mask)
extreme_warm_URM = URM_all[extreme_warm_users_mask]
mask = np.in1d(very_warm_users, extreme_warm_users, invert=True)
not_too_warm_users = very_warm_users[mask]
temp = np.sort(np.array((very_warm_URM[mask] > 0).sum(axis=1)).squeeze())
plt.plot(temp)
plt.xlabel('User index')
plt.ylabel('Number of interactions')
plt.show()
# Same plot for the complement: the extreme-warm subset of the very warm users.
# (invert=True followed by logical_not is simply a plain membership test.)
mask = np.in1d(very_warm_users, extreme_warm_users)
temp = np.sort(np.array((very_warm_URM[mask] > 0).sum(axis=1)).squeeze())
plt.plot(temp)
plt.xlabel('User index')
plt.ylabel('Number of interactions')
plt.show()
Let's start from the true outliers: how does item similarity behave for them? Let's see how often they like popular items, and how often they instead do not prefer them.
def get_pop_proportion(threhsold, URM=None, users=None):
    """Count, per user, how many liked items are popular vs unpopular.

    Args:
        threhsold: popularity threshold forwarded to ``get_popular_items``
            (parameter name kept misspelled for backward compatibility).
        URM: user-rating matrix to analyse; defaults to the global ``URM_all``.
        users: array of user ids to analyse; defaults to the global
            ``extreme_warm_users``.

    Returns:
        Tuple ``(quantity_of_unpop_items_liked, quantity_of_popular_items_liked)``
        of float arrays aligned with ``users``.
    """
    # Backward-compatible generalization: fall back to the notebook globals.
    URM = URM_all if URM is None else URM
    users = extreme_warm_users if users is None else users
    pop_items = get_popular_items(URM, threhsold)
    quantity_of_popular_items_liked = np.zeros(users.size)
    quantity_of_unpop_items_liked = np.zeros(users.size)
    for i, user in enumerate(users):
        items_liked_by_user = URM[user].indices
        # Popular items that this user liked; the rest of their likes are unpopular.
        q_pop = np.in1d(pop_items, items_liked_by_user).sum()
        quantity_of_popular_items_liked[i] = q_pop
        quantity_of_unpop_items_liked[i] = items_liked_by_user.size - q_pop
    return quantity_of_unpop_items_liked, quantity_of_popular_items_liked
# Popular/unpopular split with a high popularity threshold (500 interactions).
quantity_of_unpop_items_liked, quantity_of_popular_items_liked = get_pop_proportion(500)
quantity_of_popular_items_liked
quantity_of_unpop_items_liked
# Same split with threshold 1: an "unpopular" item is then liked by almost no one.
quantity_of_unpop_items_liked, quantity_of_popular_items_liked = get_pop_proportion(1)
quantity_of_unpop_items_liked
Some of them are the only ones who liked a given item.
# Intermediate threshold (30) for a more balanced popular/unpopular split.
quantity_of_unpop_items_liked, quantity_of_popular_items_liked = get_pop_proportion(30)
quantity_of_unpop_items_liked
Maybe we should focus here, for instance, to obtain more fine-grained information about their tastes.
# First let's see how these impact the similarity matrix between users
user_cf = best_models.UserCF.get_model(URM_train=URM_all, load_model=False)
W_sparse = user_cf.W_sparse
W_sparse[:, 55].data.size # This very active user is in the neighbourhood of many users
W_sparse[55].data.size # And it has 7525 valid similarity values: its taste is related to many users — it may be strange
# Let's look at its top neighbourhood: sorted similarity values of user 55.
values = np.sort(W_sparse[55].data)
plt.plot(values, 'ro')
plt.xlabel('User index')
plt.ylabel('Similarity value of user 55 (the most extreme)')
plt.show()
# Take the 50 largest similarity values of user 55 and count how many of the
# corresponding neighbours are themselves extreme-warm or very warm users.
top_50_values = values[-50:]
top_50_similar_users = W_sparse[55].indices[np.isin(W_sparse[55].data, top_50_values)] # top 50 most similar users
count_extreme = int(np.in1d(top_50_similar_users, extreme_warm_users).sum())
count_very = int(np.in1d(top_50_similar_users, very_warm_users).sum())
count_extreme
count_very
# Contrast: the sorted similarity profile of a low-activity user (user 25).
sorted_sims_25 = np.sort(W_sparse[25].data)
plt.plot(sorted_sims_25, 'ro')
plt.xlabel('User index')
plt.ylabel('Similarity value of user 25 (with few ratings)')
plt.show()
# Rebuild the reader with the feature-engineering pipeline used by the best models.
# NOTE(review): path is "../data/" here but "../../data/" at the top of the file — confirm which is correct.
data_reader = RecSys2019Reader("../data/")
# Add a per-sub-class count feature.
data_reader = DataPreprocessingFeatureEngineering(data_reader,
                                                  ICM_names_to_count=["ICM_sub_class"])
# Impute missing asset/price values with the median.
data_reader = DataPreprocessingImputation(data_reader,
                                          ICM_name_to_agg_mapper={"ICM_asset": np.median,
                                                                  "ICM_price": np.median})
# Transform skewed features: log1p of the reciprocal for asset/price, plain log1p for the counts.
data_reader = DataPreprocessingTransform(data_reader,
                                         ICM_name_to_transform_mapper={"ICM_asset": lambda x: np.log1p(1 / x),
                                                                       "ICM_price": lambda x: np.log1p(1 / x),
                                                                       "ICM_item_pop": np.log1p,
                                                                       "ICM_sub_class_count": np.log1p})
# Discretize the continuous features into the given number of bins.
data_reader = DataPreprocessingDiscretization(data_reader,
                                              ICM_name_to_bins_mapper={"ICM_asset": 200,
                                                                       "ICM_price": 200,
                                                                       "ICM_item_pop": 50,
                                                                       "ICM_sub_class_count": 50})
data_reader.load_data()
ICM_all = data_reader.get_ICM_from_name("ICM_all")
# Score distribution of the hybrid ItemCBF_CF model for each extreme-warm user.
item_cbf_cf_all = new_best_models.ItemCBF_CF.get_model(URM_train=URM_all, ICM_train=ICM_all)
scores_extreme = item_cbf_cf_all._compute_item_score(user_id_array=extreme_warm_users)
for i in range(scores_extreme.shape[0]):
    sorted_scores = np.sort(scores_extreme[i])
    plt.plot(sorted_scores, 'ro')
    plt.xlabel('Item index')
    plt.ylabel('Scores for user {}'.format(extreme_warm_users[i]))
    plt.show()
# Users with intermediate activity: strictly between 10 and 30 interactions.
# SIMPLIFIED: the original built two separate masks and combined them with
# np.unique(np.intersect1d(...)); a single logical AND over the activity
# vector yields the same sorted, unique user ids directly.
user_degrees = np.ediff1d(URM_all.tocsr().indptr)
mid_mask = (user_degrees > 10) & (user_degrees < 30)
mid_users = np.arange(URM_all.shape[0])[mid_mask]
scores_mid = item_cbf_cf_all._compute_item_score(user_id_array=mid_users)
# Plot the sorted score distribution for each mid-activity user.
for i in range(0, scores_mid.shape[0]):
    plt.plot(np.sort(scores_mid[i]), 'ro')
    plt.xlabel('Item index')
    plt.ylabel('Score of users {}'.format(mid_users[i]))
    plt.show()
# For every extreme-warm user, collect the sub-class indices of the items they liked.
ICM_subclass = data_reader.get_ICM_from_name("ICM_sub_class")
subclass_extreme = [ICM_subclass[URM_all[user].indices].indices
                    for user in extreme_warm_users]
From "Identifying Grey Sheep Users By The Distribution of User Similarities In Collaborative Filtering" — Zheng et al.
The method is divided into 4 steps:
# STEP 1: REPRESENTING DISTRIBUTION OF SIMILARITIES, FOR EACH USER
# For every user, summarize the distribution of their non-zero user-user
# similarity values: quartiles, mean, standard deviation and skewness.
user_cf = best_models.UserCF.get_model(URM_train=URM_all, load_model=False)
W_sparse = user_cf.W_sparse
n_users = URM_all.shape[0]
sim_q1 = np.zeros(n_users)
sim_q2 = np.zeros(n_users)
sim_q3 = np.zeros(n_users)
sim_mean = np.zeros(n_users)
sim_std = np.zeros(n_users)
sim_skew = np.zeros(n_users)
for i in range(n_users):
    if i % 10000 == 0:
        print("Done {} over {}".format(i, n_users))
    # PERF: the original sliced W_sparse[i] six times per iteration and bound
    # an unused `user_similarity`; extract the row's data once instead.
    row = W_sparse[i].data
    if row.size > 0:
        sim_q1[i], sim_q2[i], sim_q3[i] = np.quantile(a=row, q=[0.25, 0.5, 0.75])
        sim_mean[i] = row.mean()
        sim_std[i] = row.std()
        sim_skew[i] = skew(row)
# STEP 2: EXAMPLE SELECTIONS (here, I chose to consider only skewness; otherwise too few users were selected)
# Bad examples selection: users with strongly skewed, non-degenerate similarity distributions.
empirical_threshold_skew = np.quantile(a=sim_skew, q=0.75)
bad_examples_users_skew = np.argwhere((sim_skew > empirical_threshold_skew) & (sim_mean > 0)).squeeze()
# BUG FIX: the original computed np.intersect1d(bad_examples_users_skew,
# bad_examples_users_skew) — intersecting the array with itself, a no-op
# (argwhere output is already sorted and unique). Presumably it was meant to
# intersect the skew- and mean-based selections; kept as the skew set only.
bad_examples = bad_examples_users_skew
include_other_info = False
if include_other_info:
    empt_t_mean = np.quantile(a=sim_mean, q=0.25)
    bad_examples_users_mean = np.argwhere((sim_mean < empt_t_mean) & (sim_mean > 0)).squeeze()
    # NOTE(review): this overwrites rather than intersects with the skew set —
    # likely intended np.intersect1d(bad_examples_users_skew, bad_examples_users_mean); confirm.
    bad_examples = bad_examples_users_mean
# Good examples: users with LOW skewness (bottom quartile) and non-degenerate mean.
empirical_threshold_skew = np.quantile(a=sim_skew, q=0.25)
good_examples_users_skew = np.argwhere((sim_skew < empirical_threshold_skew) & (sim_mean > 0)).squeeze()
good_examples = good_examples_users_skew
# Debug Content
# Cross-check the selected examples against almost-cold (<= 2 interactions)
# and very warm (> 40 interactions) user groups.
user_degrees = np.ediff1d(URM_all.tocsr().indptr)
almost_cold_users_mask = user_degrees <= 2
almost_cold_users = np.flatnonzero(almost_cold_users_mask)
very_warm_users_mask = user_degrees > 40
very_warm_users = np.flatnonzero(very_warm_users_mask)
print("There are {} users with less than 3 interactions in the bad examples, which are {}".format(np.in1d(bad_examples, almost_cold_users).sum(), bad_examples.size))
print("There are {} users with less than 3 interactions in the good examples, which are {}".format(np.in1d(good_examples, almost_cold_users).sum(), good_examples.size))
print("There are {} users with more than 40 interactions in the bad examples, which are {}".format(np.in1d(bad_examples, very_warm_users).sum(), bad_examples.size))
print("There are {} users with more than 40 interactions in the good examples, which are {}".format(np.in1d(good_examples, very_warm_users).sum(), good_examples.size))
# Sorted activity of the bad-example users.
temp = (URM_all[bad_examples] > 0).sum(axis=1)
temp = np.array(temp).squeeze()
temp = np.sort(temp)
# BUG FIX: the original plotted `user_act` although `temp` (the bad-example
# activity just computed, otherwise unused) is clearly the intended series.
plt.plot(temp, 'ro')
plt.xlabel('User index')
plt.ylabel('Number of interactions')
plt.show()
# Compare similarity-distribution statistics of the extreme-warm users against
# the population of users with more than 15 interactions.
sim_skew[extreme_warm_users]
users_to_keep = np.flatnonzero(np.ediff1d(URM_all.tocsr().indptr) > 15)
sim_skew[users_to_keep].mean()
sim_mean[extreme_warm_users]
sim_mean.mean()
As we can see, these users have a very low mean similarity to other users. Skewness, however, does not seem to be present in their similarity distribution, at least compared to that of other users. What happens if we take them away while training? Do the scores improve?
# Spread of the mean-similarity values, used for the (mean - 3*std) outlier check discussed below.
sim_mean.std()
However, they do not seem to be real outliers using (mean - 3*std). One should note, though, that a known problem of this method is that it can fail: "This method can fail to detect outliers because the outliers increase the standard deviation. The more extreme the outlier, the more the standard deviation is affected."